# 01_clean_data.py
from pathlib import Path

import pandas as pd

# ---- CONFIGURATION ----
# Input/output locations and the columns every usable record must carry.
RAW_CSV = r"data/raw/regional_sales_extended.csv"
PROCESSED_CSV = r"data/Processed/monthly_regional_sales.csv"
REQUIRED_COLUMNS = ["Date", "Sales", "Region"]


def clean_sales(df):
    """Return a cleaned copy of *df* containing only usable sales records.

    ``Date`` is parsed to datetime and ``Sales`` to a numeric dtype; values
    that cannot be parsed become missing.  Rows with a missing ``Date``,
    ``Sales`` or ``Region`` are dropped, as are rows whose ``Sales`` is zero
    or negative.  The input frame is not modified and the original index
    labels of the surviving rows are kept.

    Raises:
        KeyError: if ``Date`` or ``Sales`` is not a column of *df* (raised by
            the column lookup), or if ``Region`` is missing (raised by
            ``dropna``).
    """
    # Coerce both columns *before* dropping rows so that unparseable values
    # are removed explicitly by ``dropna`` instead of implicitly by the
    # ``> 0`` comparison below.
    coerced = df.assign(
        Date=pd.to_datetime(df["Date"], errors="coerce"),
        Sales=pd.to_numeric(df["Sales"], errors="coerce"),
    )
    complete = coerced.dropna(subset=REQUIRED_COLUMNS)

    # Remove invalid sales (zero or negative)
    return complete[complete["Sales"] > 0]


def aggregate_monthly_by_region(df):
    """Sum ``Sales`` per calendar month and ``Region``.

    Expects ``Date`` to already be a datetime column (see ``clean_sales``).
    Returns a frame with columns ``Month`` (month-end timestamp), ``Region``
    and ``Sales``, keeping the regional breakdown for individual forecasting.
    """
    # A MonthEnd offset object gives the same month-end bins as the "M"
    # alias, which is deprecated since pandas 2.2 (renamed to "ME"); the
    # offset object works without warnings on both old and new versions.
    month_end_bins = pd.Grouper(key="Date", freq=pd.offsets.MonthEnd())
    monthly = (
        df
        .groupby([month_end_bins, "Region"])
        .agg({"Sales": "sum"})
        .reset_index()
    )

    # Rename Date column to Month for clarity
    return monthly.rename(columns={"Date": "Month"})


def main():
    """Load the raw sales CSV, clean it, aggregate it and save the result."""
    # Load dataset
    df = pd.read_csv(RAW_CSV)

    # ---- BASIC INSPECTION ----
    print("Columns:", df.columns.tolist())
    print(f"Total records: {len(df)}")
    # Parse dates before taking min/max: on the raw text column the
    # comparison would be lexicographic rather than chronological.
    parsed_dates = pd.to_datetime(df["Date"], errors="coerce")
    print(f"Date range: {parsed_dates.min()} to {parsed_dates.max()}")
    print(f"Regions: {df['Region'].unique()}")

    # ---- DATA CLEANING ----
    df = clean_sales(df)
    print(f"\nRecords after cleaning: {len(df)}")

    # ---- AGGREGATE BY MONTH AND REGION ----
    monthly_regional_sales = aggregate_monthly_by_region(df)

    print("\nMonthly regional sales sample:")
    print(monthly_regional_sales.head(10))
    print(f"\nTotal months per region: {monthly_regional_sales.groupby('Region').size()}")

    # ---- SAVE FOR FORECASTING ----
    output_path = Path(PROCESSED_CSV)
    # to_csv does not create missing directories, so make sure it exists.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    monthly_regional_sales.to_csv(output_path, index=False)

    print("\nProcessed file saved: data/Processed/monthly_regional_sales.csv")


if __name__ == "__main__":
    main()